First, let’s load in and clean the data.
library(readxl)
edu_wide <- read_excel("/Users/zachheinemann/Documents/Education.xls", skip=5)
edu_wide <- edu_wide[,-grep("x_", names(edu_wide))]
edu_wide <- as.data.frame(edu_wide)
# Convert from wide to long
library(reshape)
edu_long <- stats::reshape(edu_wide,
varying = names(edu_wide)[4:23],
direction = "long",
idvar = c("fips_code"),
timevar = "year",
sep = "_")
# Identify states and counties
edu_long$area_type <- ifelse(as.numeric(substr(edu_long$fips_code,3,5))!=0,"county","state")
edu_long$area_type <- ifelse(as.numeric(substr(edu_long$fips_code,1,5))==0,"country",edu_long$area_type)
state <- edu_long[edu_long$area_type == "state", ]
US <- edu_long[edu_long$area_type == "country", ]
statestack <- data.frame(state = state$state, year = state$year, stack(state, selec = lessHS:coll))
statestack$ind <- as.character(statestack$ind)
USstack <- data.frame(US = US$state, year = US$year, stack(US, selec = lessHS:coll))
USstack$ind <- as.character(USstack$ind)
statestack$ind[statestack$ind == "coll"] <- "College"
statestack$ind[statestack$ind == "HS"] <- "High School"
statestack$ind[statestack$ind == "somecoll"] <- "Some College"
statestack$ind[statestack$ind == "lessHS"] <- "Less Than High School"
names(statestack)[1] <- "State"
names(statestack)[2] <- "Year"
names(statestack)[3] <- "Percentage"
names(statestack)[4] <- "Type"
USstack$ind[USstack$ind == "coll"] <- "College"
USstack$ind[USstack$ind == "HS"] <- "High School"
USstack$ind[USstack$ind == "somecoll"] <- "Some College"
USstack$ind[USstack$ind == "lessHS"] <- "Less Than High School"
names(USstack)[1] <- "US"
names(USstack)[2] <- "Year"
names(USstack)[3] <- "Percentage"
names(USstack)[4] <- "Type"
Now we can begin with the overall United States data.
library(ggthemes)
library(plotly)
## Loading required package: ggplot2
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:reshape':
##
## rename
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
plot <- ggplot(USstack, aes(x = Year, y = Percentage, colour = Type)) + geom_smooth(method = loess, se = TRUE) + theme_tufte() + facet_grid(.~Type) + ylab("Percentage") + xlab("Year") + labs(colour = "Education Received") + ggtitle("US Percentage Educated Over Time By Education Type") + theme(legend.position="none")
ggplotly(plot)
However, this does not include many time points and is not all that informative. We can instead use an aggregate of the state level data [which should theoretically be similar to the overall US data based on the calculations].
plot <- ggplot(statestack, aes(x = Year, y = Percentage, colour = Type)) + geom_smooth(method = loess, se = TRUE) + theme_tufte() + ylab("Percentage") + xlab("Year") + labs(colour = "Education Received") + ggtitle("State Percentage Educated Over Time By Education Type")
ggplotly(plot)
This still fails to look at individual trends over states. While these next few graphs will be quite cumbersome, the interactive nature makes it easier for the user to discern some information from the graphs.
plot <- ggplot(statestack[statestack$Type == "College",], aes(x = Year, y = Percentage, colour = State)) + geom_line() + theme_tufte() + ylab("Percentage") + xlab("Year") + theme(legend.position="none") + ggtitle("Percentage College Educated Over Time By State")
ggplotly(plot)
plot <- ggplot(statestack[statestack$Type == "College",], aes(x = Year, y = Percentage, colour = State)) + geom_line() + theme_tufte() + ylab("Percentage") + xlab("Year") + theme(legend.position="none") + ggtitle("Percentage College Educated Over Time By State")
ggplotly(plot)
plot <- ggplot(statestack[statestack$Type == "High School",], aes(x = Year, y = Percentage, colour = State)) + geom_line() + theme_tufte() + ylab("Percentage") + xlab("Year") + theme(legend.position="none") + ggtitle("Percentage High School Educated Over Time By State")
ggplotly(plot)
plot <- ggplot(statestack[statestack$Type == "Less Than High School",], aes(x = Year, y = Percentage, colour = State)) + geom_line() + theme_tufte() + ylab("Percentage") + xlab("Year") + theme(legend.position="none") + ggtitle("Percentage Less Than High School Educated Over Time By State")
ggplotly(plot)
plot <- ggplot(statestack[statestack$Type == "Some College",], aes(x = Year, y = Percentage, colour = State)) + geom_line() + theme_tufte() + ylab("Percentage") + xlab("Year") + theme(legend.position="none") + ggtitle("Percentage Some College Educated Over Time By State")
ggplotly(plot)
However, perhaps the most informative would be an ordered dot plot which allows users to look at the percentages over time. For the sake of simplicity, this graph will focus on a particularly dynamic variable – Less than High School Educaiton.
statestack <- statestack[order(statestack[,2], statestack[,3]),]
p <- ggplot(statestack[statestack$Type == "Less Than High School",], aes(x=Percentage, y= State, color= Percentage)) + geom_point() + scale_size_continuous(range = c(1,10)) + facet_wrap(~Year) + scale_colour_gradient(low = "light blue", high = "dark blue") + theme_tufte() + theme(legend.position="none", axis.text.y = element_blank(), axis.ticks.y = element_blank()) + ggtitle("Percentage Less Than High School Educated Over Time By State") + ylab("State")
p <- ggplotly(p, tooltip = c("x","y"))
p
For this part, we will use the ecdf function, which gives the expected cumulative distribution of a vector, which can be used to put the vector back into the function and get the percentiles that each item in the vector falls into with reference to the other items. Then, for the sake of ease, we can round to two decimal places, which is often the case with percentiles.
edu_2015 <- edu_long[edu_long$year == 2015, ]
counties <- edu_2015[edu_2015$area_type == "county", ]
f <- ecdf(counties$lessHS)
counties$lessHSP <- f(counties$lessHS) * 100
f <- ecdf(counties$HS)
counties$HSP <- f(counties$HS) * 100
f <- ecdf(counties$somecoll)
counties$somecollP <- f(counties$somecoll) * 100
f <- ecdf(counties$coll)
counties$collP <- f(counties$coll) * 100
counties <- counties[,c(2:3,10:13)]
counties[,3:6] <- round(counties[,3:6], digits = 2)
rownames(counties) <- c()
library(DT)
datatable(counties)
For this part, I will create two state graphs that can be used for comparisons [for the sake of ease, zooming, and the fact that these two states are on the opposite side of the country, we will graph them separately]. The states selected are California and Pennsylvania for a few reasons. First, I am from PA and it is known for having tremendous disparities in education (specifically it often referred to as Philadelphia and Pittsburgh with Alabama in between) and voted red in the last election (a measure typically associated with education levels). We can contrast this with a state that is known for having similar educaiton disparities (specifically in the center and the northern most end of the state), but consistently votes blue in general elections. Ideally, we could overlay this data with voting data in order to make the visualization the most informative, but this data is not available (or at least I don’t have it).
library(leaflet)
library(tigris)
##
## Attaching package: 'tigris'
## The following object is masked from 'package:graphics':
##
## plot
#get CA counties shapefile
counties_CA <- counties(state="California")
#filter CA education data
CA <- filter(edu_2015, state == "CA")
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:reshape':
##
## rename
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
combined <- counties_CA@data %>% left_join(CA, by = c("NAMELSAD" = "area_name"))
counties_CA@data <- combined
leaflet(counties_CA) %>%
setView(lat=38, lng=-120 , zoom=5) %>%
addTiles(group = "OpenStreetMap") %>%
addProviderTiles(providers$Stamen.Toner, group = "Toner") %>%
addProviderTiles(providers$Stamen.TonerLite, group = "Toner Lite") %>%
addPolygons(group = "College",
stroke = FALSE, fillOpacity = 0.5, smoothFactor = 0.5,
color = ~colorNumeric("Blues", coll)(coll),
popup = paste("County:",counties_CA$NAMELSAD,"<br/>",
"Percent With Less than High School Degree:", counties_CA$lessHS, "<br/>",
"Percent with High School Degree:", counties_CA$HS, "<br/r>",
"Percent with Some College Completed:", counties_CA$somecoll, "<br/r>",
"Percent with College Degree:", counties_CA$coll, "<br/r>")) %>%
addPolygons(group="Less than High School",
stroke = FALSE, fillOpacity = 0.5, smoothFactor = 0.5,
color = ~colorNumeric("Blues", lessHS)(lessHS),
popup = paste("County:",counties_CA$NAMELSAD,"<br/>",
"Percent With Less than High School Degree:", counties_CA$lessHS, "<br/>",
"Percent with High School Degree:", counties_CA$HS, "<br/r>",
"Percent with Some College Completed:", counties_CA$somecoll, "<br/r>",
"Percent with College Degree:", counties_CA$coll, "<br/r>")) %>%
addPolygons(group="High School",
stroke = FALSE, fillOpacity = 0.5, smoothFactor = 0.5,
color = ~colorNumeric("Blues", HS)(lessHS),
popup = paste("County:",counties_CA$NAMELSAD,"<br/>",
"Percent With Less than High School Degree:", counties_CA$lessHS, "<br/>",
"Percent with High School Degree:", counties_CA$HS, "<br/r>",
"Percent with Some College Completed:", counties_CA$somecoll, "<br/r>",
"Percent with College Degree:", counties_CA$coll, "<br/r>")) %>%
addPolygons(group="Some College",
stroke = FALSE, fillOpacity = 0.5, smoothFactor = 0.5,
color = ~colorNumeric("Blues", somecoll)(somecoll),
popup = paste("County:",counties_CA$NAMELSAD,"<br/>",
"Percent With Less than High School Degree:", counties_CA$lessHS, "<br/>",
"Percent with High School Degree:", counties_CA$HS, "<br/r>",
"Percent with Some College Completed:", counties_CA$somecoll, "<br/r>",
"Percent with College Degree:", counties_CA$coll, "<br/r>")) %>%
addLayersControl(
baseGroups = c("OpenStreetMap", "Toner", "Toner Lite"),
overlayGroups = c("Less than High School","High School", "Some College", "College"),
options = layersControlOptions(collapsed = TRUE) )